# -*- coding: utf-8 -*-
'''
Scrawl webpage from SINA most clicked, commentted, and shared
And save the result to postgresql

Ubuntu 14.04
Python 2.7.9
pgadmin 9.3

author@qiidee

'''
import urllib2
import urllib
import urlparse
import psycopg2
import time
import datetime
import socket
import sys
import json
from urllib2 import Request, urlopen, URLError, HTTPError
from dateutil.parser import parse


def _decode_list(data):
    rv = []
    for item in data:
        if isinstance(item, unicode):
            item = item.encode('utf-8')
        elif isinstance(item, list):
            item = _decode_list(item)
        elif isinstance(item, dict):
            item = _decode_dict(item)
        rv.append(item)
    return rv

def _decode_dict(data):
    rv = {}
    for key, value in data.iteritems():
        if isinstance(key, unicode):
            key = key.encode('utf-8')
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        elif isinstance(value, list):
            value = _decode_list(value)
        elif isinstance(value, dict):
            value = _decode_dict(value)
        rv[key] = value
    return rv


def _retrieve_click(nid, top_time, top_num):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)
 
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
 
    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()
 
    select_sql = "SELECT * FROM sinaclick WHERE id = %s and top_time = \'%s\' and top_num = %s " % (nid, top_time, top_num)
    cursor.execute(select_sql)
 
    # retrieve the records from the database
    records = cursor.fetchall()
 
    return records

def _retrieve_comment(nid, top_time, top_num):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)
 
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
 
    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()
 
    select_sql = "SELECT * FROM sinacomment WHERE id = %s and top_time = \'%s\' and top_num = %s " % (nid, top_time, top_num)
    cursor.execute(select_sql)
 
    # retrieve the records from the database
    records = cursor.fetchall()
 
    return records

def _retrieve_share(nid, top_time, top_num):
    '''
    retrieve record from database for checking duplicates
    '''
    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Retrieve records: connecting to database\n    ->%s" % (conn_string)
 
    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
 
    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()
 
    select_sql = "SELECT * FROM sinashare WHERE id = %s and top_time = \'%s\' and top_num = %s " % (nid, top_time, top_num)
    cursor.execute(select_sql)
 
    # retrieve the records from the database
    records = cursor.fetchall()
 
    return records

def _insert_record(sql):

    conn_string = "host='localhost' dbname='acdb' user='postgres' password='postgres'"
    # print the connection string we will use to connect
    #print "Insert records: connecting to database to insert \n    ->%s" % (conn_string)

    # get a connection, if a connect cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)

    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()

    cursor.execute(sql)
    conn.commit()



def is_number(s):
    '''
    Determine whether a string is a number 
    '''
    try:
        int(s)
        return True
    except ValueError:
        return False

def _read_click(params):
    urllink = urllib.urlencode(params)
    #print urllink
    baseurl = 'http://top.news.sina.com.cn/ws/GetTopDataList.php?'
    link = baseurl + urllink
    req = urllib2.Request(link)
    handler = urllib2.urlopen(req, timeout=40)
    html = handler.read()
    # remove the variable name and extra space
    query = html[12:-2]
    data1 = json.loads(query, object_hook=_decode_dict)
    news = data1['data']
    numNews = len(news)
    if int(topshowNum) > numNews:
        print "number of news missing is..."
        print (int(topshowNum) - numNews)
    for n in range(numNews):
        newsitem = news[n]
        keys = newsitem.keys()
        keystring = ', '.join(keys)
        #print keystring
        valuestring = ''
        for key in keys:
            value = newsitem[key]
            if isinstance(value, basestring):
                value = value.replace('\'', '_')
                value = value.replace('\"', '__')
                
            if key == 'top_num':
                value = value.replace(',', '')
                top_num = value
                valuestring = valuestring + value + ','
            elif key == 'top_time':
                top_time = value
                valuestring = valuestring + '\'' + value + '\'' + ','
            elif key == 'ext1':
                if value:
                    valuestring = valuestring + value + ','
                else:
                    # value = 'f'
                    valuestring = valuestring + '\'' + 'f' + '\'' + ','
            elif key == 'id':
                nid = value
                valuestring = valuestring + value + ','
            else:
                if isinstance(value, bool):
                    valuestring = valuestring + '\'' + str(value) + '\'' + ','
                else:
                    valuestring = valuestring + '\'' + value + '\'' + ','
        valuestring = valuestring[:-1]
        insert_sql = 'INSERT INTO sinaclick (' + keystring + ') VALUES (' + valuestring + ')'
        record = _retrieve_click(nid, top_time, top_num)
        if len(record) > 0:
            print "duplicate!"
            print (nid, top_time, top_num)
        else:
            try:
                _insert_record(insert_sql)
            except:
                print insert_sql
#                raise
    
def _read_comment(params):
    urllink = urllib.urlencode(params)
    #print urllink
    baseurl = 'http://top.news.sina.com.cn/ws/GetTopDataList.php?'
    link = baseurl + urllink
    req = urllib2.Request(link)
    handler = urllib2.urlopen(req, timeout=40)
    html = handler.read()
    query = html[12:-2]
    data1 = json.loads(query, object_hook=_decode_dict)
    #print data1
    news = data1['data']
    numNews = len(news)
    if int(topshowNum) > numNews:
        print "number of news missing is..."
        print (int(topshowNum) - numNews)
    for n in range(numNews):
        newsitem = news[n]
        keys = newsitem.keys()
        keystring = ', '.join(keys)
        valuestring = ''
        for key in keys:
            value = newsitem[key]
            #print key
            #print value
            if isinstance(value, basestring):
                value = value.replace('\'', '_')
                value = value.replace('\"', '__')
                
            if key == 'top_num':
                value = value.replace(',', '')
                top_num = value
                valuestring = valuestring + value + ','
            elif key == 'top_time':
                top_time = value
                valuestring = valuestring + '\'' + value + '\'' + ','
            elif key == 'ext5':
                if value == False:
                    valuestring = valuestring + '\'false_value\'' + ','
                else:
                    valuestring = valuestring + '\'' + value + '\'' + ','
            elif key == 'id':
                nid = value
                valuestring = valuestring + value + ','
            else:
                valuestring = valuestring + '\'' + value + '\'' + ','
        valuestring = valuestring[:-1]
        insert_sql = 'INSERT INTO sinacomment (' + keystring + ') VALUES (' + valuestring + ')'
        record = _retrieve_comment(nid, top_time, top_num)
        if len(record) > 0:
            print "duplicate!"
            print (nid, top_time, top_num)
        else:
            _insert_record(insert_sql)
        
def _read_share(params):
    urllink = urllib.urlencode(params)
    baseurl = 'http://top.news.sina.com.cn/ws/GetTopDataList.php?'
    link = baseurl + urllink
    req = urllib2.Request(link)
    handler = urllib2.urlopen(req, timeout=40)
    html = handler.read()
    query = html[12:-2]
    data1 = json.loads(query, object_hook=_decode_dict)
    #print data1
    try:
        news = data1['data']
        numNews = len(news)
        if int(topshowNum) > numNews:
            print "number of news missing is..."
            print (int(topshowNum) - numNews)
        for n in range(numNews):
            newsitem = news[n]
            keys = newsitem.keys()
            keystring = ', '.join(keys)
            #print keystring
            valuestring = ''
            for key in keys:
                value = newsitem[key]
                if isinstance(value, basestring):
                    value = value.replace('\'', '_')
                    value = value.replace('\"', '__')
                    
                if key == 'top_num':
                    value = value.replace(',', '')
                    top_num = value
                    valuestring = valuestring + value + ','
                elif key == 'top_time':
                    top_time = value
                    valuestring = valuestring + '\'' + value + '\'' + ','
                elif key == 'ext1':
                    if value == False:
                        valuestring = valuestring + '\'false_value\'' + ','
                    else:
                        valuestring = valuestring + '\'' + value + '\'' + ','
                elif key == 'id':
                    nid = value
                    valuestring = valuestring + value + ','
                else:
                    valuestring = valuestring + '\'' + value + '\'' + ','
            valuestring = valuestring[:-1]
            insert_sql = 'INSERT INTO sinashare (' + keystring + ') VALUES (' + valuestring + ')'
            #print insert_sql
            record = _retrieve_share(nid, top_time, top_num)
            if len(record) > 0:
                print "duplicate!"
                print (nid, top_time, top_num)
            else:
                _insert_record(insert_sql)
    except: # no data returned
        pass
               
                    

if __name__ == '__main__':
    ##    ## link inside sina
##    # http://top.news.sina.com.cn/ws/GetTopDataList.php?top_type=day&top_cat=society&top_time=20140126&top_show_num=20&top_order=ASC&js_var=news_
##    # top_cat: news categories
##    # www_all, 新闻总排行 点击量排行  top_order: ASC
##    # qbpdpl, 全部频道评论 top_num: comment number  ext2: picture link
##    # wbrmzf_qz, sphhzpx,
##    # www_slide_image, china, gnxwpl, wbrmzfgnxw,
##    # world, gjxwpl, wbrmzfgwxw, society, shxwpl,
##    # wbrmzfshxw, all, tyxwpl, wbrmzfty,cjxwpl, wbrmzfcj, ylxwpl, kjxwpl
##    # wbrmzfkj, jsxwpl, wbrmzfjsxw, wbrmzfyl, ylxwpl, wbrmzfcj, cjxwpl
##    #
##    #
##    # top_order: ranking order
##    # DESC, ASC
##    #

    params = {'top_type':'day',
              'top_cat':'qbpdpl',
              # 'top_time':'20140126',
              'top_show_num':'100',
              'top_order':'ASC',
              'js_var':'news_'}
    top_cat = ['www_all', 'qbpdpl', 'wbrmzf_qz', 'sphhzpx',
               'www_slide_image', 'china', 'gnxwpl', 'wbrmzfgnxw',
               'world', 'gjxwpl', 'wbrmzfgwxw', 'society', 'shxwpl',
               'wbrmzfshxw', 'all', 'tyxwpl', 'wbrmzfty','cjxwpl',
               'wbrmzfcj', 'ylxwpl', 'kjxwpl', 'wbrmzfkj', 'jsxwpl',
               'wbrmzfjsxw', 'wbrmzfyl', 'ylxwpl', 'wbrmzfcj', 'cjxwpl']
    
    topshowNum = params['top_show_num']
    # startdate
    startDate = datetime.date(2017,12,22)
    watchPeriod = 30
    # watchPeriod = 320
    for day in range(watchPeriod):
        newsDate = startDate + datetime.timedelta(day)
        newsDate = newsDate.strftime('%Y%m%d')
        print newsDate
        params['top_time']=newsDate
        #print params
        for cat in range(5,8):
            if cat == 5:
                # news category - most clicked
                chinaclick = top_cat[5]
                params['top_cat'] = chinaclick
                params['top_order'] = 'ASC'
                print(params)
                try:
                    _read_click(params)
                except (urllib2.HTTPError,urllib2.URLError) as e:
                    time.sleep(20)
                    try:
                        _read_click(params)
                    except:
                        print "unable to reach the click page..."
                        print day
            elif cat == 6:
                # 国内新闻评论  ext2: pic link ext3: unknow number
                # ext5: subheading  top_num: number of comments
                chinacomment = top_cat[6]
                params['top_cat'] = chinacomment
                params['top_order'] = 'DESC'
                try:
                    _read_comment(params)
                except (urllib2.HTTPError,urllib2.URLError) as e:
                    time.sleep(20)
                    try:
                        _read_comment(params)
                    except:
                        print "unable to reach the comment page..."
                        print day


    print "it's done!"
